/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/* TODO:
 * - tweak to test request > sizeof(mcp_ureq_t)
 * - SSE with fence = 1
 * - Altivec on ppc ?
 */

#include "mx_auto_config.h"
#include "myriexpress.h"

#include <stdlib.h>
#if MX_OS_WINNT
#include "getopt.h"
#else
#include <unistd.h>
#endif
#include <assert.h>

#include "mx__lib_types.h"
#include "mx__endpoint.h"
#include "mx__handle_map.h"
#include "mx__mcp_request_ring.h"
#include "mx_stbar.h"
#include "mx_timing.h"

/* Defaults for the command-line options; usage() prints them and main()
 * uses them as the initial values before parsing the arguments. */
/* -b: board number */
#define DEFAULT_NIC_ID 0UL
/* -e: endpoint number */
#define DEFAULT_EID 0UL
/* -R: request length in bytes */
#define DEFAULT_REQLEN 64
/* -E: event length in bytes */
#define DEFAULT_EVLEN 64
/* -F: number of bytes written after the last fence */
#define DEFAULT_FENCE 4
/* -N: number of timed iterations */
#define DEFAULT_ITER (10*1000)
/* -W: number of untimed warmup iterations */
#define DEFAULT_WARMUP 10
/* -S: whether the SSE copy routines are used */
#define DEFAULT_USE_SSE 0

/* alignment, in bytes, applied to the request staged by run_test_loop() */
#define REQ_ALIGN 16

/*
 * Print the command-line synopsis on stderr, with the compiled-in default
 * of each option shown in square brackets.
 */
static void
usage(void)
{
  fprintf(stderr, "Usage: mx_dmabench [args]\n");
  /* both ID defaults are unsigned long constants, hence %lu */
  fprintf(stderr, "-b board_id    - Board number [%lu]\n", DEFAULT_NIC_ID);
  fprintf(stderr, "-e endpoint_id - Endpoint number [%lu]\n", DEFAULT_EID);
  fprintf(stderr, "-R reqlen      - Request length (-1 for all) [%d]\n", DEFAULT_REQLEN);
  fprintf(stderr, "-E evlen       - Event length (-1 for all) [%d]\n", DEFAULT_EVLEN);
  fprintf(stderr, "-F fence       - Number of bytes after the last fence [%d]\n"
		  "                 Negative to write the type afterwards\n", DEFAULT_FENCE);
  fprintf(stderr, "-N iter        - Number of iterations [%d]\n", DEFAULT_ITER);
  fprintf(stderr, "-W warmup iter - Number of warmup iterations [%d]\n", DEFAULT_WARMUP);
  fprintf(stderr, "-S             - Use SSE2 PIO on x86 [%s]\n", DEFAULT_USE_SSE ? "enabled" : "disabled");
  fprintf(stderr, "-v             - verbose\n");
  fprintf(stderr, "-h             - print this help\n");
}

#if defined __GNUC__ && (MX_CPU_x86 || MX_CPU_x86_64)

/*
 * Copy exactly len bytes from `from` to `to` with the widest moves available
 * and issue a single sfence once all the stores have been issued.
 *
 * Only len == 4, 8, 16, 32, 64 and 128 are implemented; any other length is
 * silently ignored and nothing is copied.  The 16-byte and larger variants
 * use movdqa, so both pointers must then be 16-byte aligned.
 *
 * Every asm statement lists the MMX/XMM registers it overwrites as clobbers,
 * and the scratch register of the 4-byte variant is an early-clobber output
 * (not an input), so the compiler never assumes these registers survive the
 * statement nor shares the scratch register with `to` or `from`.
 */
static void
memcpy_sse_nofence(char *to, char *from, int len)
{
  if (len == 4) {
    int word;
    __asm__ __volatile__(
			 "mov  (%[from]), %[word]\n\t"
			 "mov  %[word], (%[to])\n\t"
			 "sfence\n\t"
			 : [word] "=&r"(word)
			 : [to] "r"(to), [from] "r"(from)
			 : "memory");
  } else if (len == 8) {
    __asm__ __volatile__(
			 "movq  (%1),%%mm0\n\t"
			 "movq %%mm0, (%0)\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "mm0");
  } else if (len == 16) {
    __asm__ __volatile__(
			 "movdqa  (%1),%%xmm0\n\t"
			 "movdqa  %%xmm0, (%0)\n\t"
			 "sfence\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "xmm0");
  } else if (len == 32) {
    __asm__ __volatile__(
			 "movdqa    (%1),%%xmm0\n\t"
			 "movdqa  16(%1),%%xmm1\n\t"
			 "movdqa  %%xmm0,   (%0)\n\t"
			 "movdqa  %%xmm1, 16(%0)\n\t"
			 "sfence\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "xmm0", "xmm1");
  } else if (len == 64) {
    __asm__ __volatile__(
			 "movdqa    (%1),%%xmm0\n\t"
			 "movdqa  16(%1),%%xmm1\n\t"
			 "movdqa  32(%1),%%xmm2\n\t"
			 "movdqa  48(%1),%%xmm3\n\t"
			 "movdqa  %%xmm0,   (%0)\n\t"
			 "movdqa  %%xmm1, 16(%0)\n\t"
			 "movdqa  %%xmm2, 32(%0)\n\t"
			 "movdqa  %%xmm3, 48(%0)\n\t"
			 "sfence\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
  } else if (len == 128) {
    __asm__ __volatile__(
			 "movdqa    (%1),%%xmm0\n\t"
			 "movdqa  16(%1),%%xmm1\n\t"
			 "movdqa  32(%1),%%xmm2\n\t"
			 "movdqa  48(%1),%%xmm3\n\t"
			 "movdqa  64(%1),%%xmm4\n\t"
			 "movdqa  80(%1),%%xmm5\n\t"
			 "movdqa  96(%1),%%xmm6\n\t"
			 "movdqa 112(%1),%%xmm7\n\t"
			 "movdqa  %%xmm0,    (%0)\n\t"
			 "movdqa  %%xmm1,  16(%0)\n\t"
			 "movdqa  %%xmm2,  32(%0)\n\t"
			 "movdqa  %%xmm3,  48(%0)\n\t"
			 "movdqa  %%xmm4,  64(%0)\n\t"
			 "movdqa  %%xmm5,  80(%0)\n\t"
			 "movdqa  %%xmm6,  96(%0)\n\t"
			 "movdqa  %%xmm7, 112(%0)\n\t"
			 "sfence\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
			   "xmm4", "xmm5", "xmm6", "xmm7");
  }
}

/*
 * Copy exactly len bytes from `from` to `to`, issuing an sfence between the
 * first len-4 bytes and the final 4 bytes, and another one after them.
 *
 * Only len == 8, 16, 32, 64 and 128 are implemented; any other length is
 * silently ignored and nothing is copied.  The 32-byte and larger variants
 * use movdqa, so both pointers must then be 16-byte aligned.
 *
 * The two 32-bit scratch registers are early-clobber outputs (not inputs)
 * and the MMX/XMM registers overwritten by each statement are listed as
 * clobbers, so the compiler never assumes any of them survives the asm nor
 * shares a scratch register with `to` or `from`.
 */
static void
memcpy_sse_fence4(char *to, char *from, int len)
{
  /* pre: the 4 bytes stored just before the fence; last: the final 4 bytes */
  int pre, last;
  if (len == 8) {
    __asm__ __volatile__(
			 "mov   (%[from]),%[pre]\n\t"
			 "mov  4(%[from]),%[last]\n\t"
			 "mov  %[pre],  (%[to])\n\t"
			 "sfence\n\t"
			 "mov  %[last], 4(%[to])\n\t"
			 "sfence\n\t"
			 : [pre] "=&r"(pre), [last] "=&r"(last)
			 : [to] "r"(to), [from] "r"(from)
			 : "memory");
  } else if (len == 16) {
    __asm__ __volatile__(
			 "movq    (%[from]),%%mm0\n\t"
			 "mov    8(%[from]),%[pre]\n\t"
			 "mov   12(%[from]),%[last]\n\t"
			 "movq  %%mm0,  (%[to])\n\t"
			 "mov   %[pre], 8(%[to])\n\t"
			 "sfence\n\t"
			 "mov   %[last], 12(%[to])\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 : [pre] "=&r"(pre), [last] "=&r"(last)
			 : [to] "r"(to), [from] "r"(from)
			 : "memory", "mm0");
  } else if (len == 32) {
    __asm__ __volatile__(
			 "movdqa    (%[from]),%%xmm0\n\t"
			 "movq    16(%[from]),%%mm0\n\t"
			 "mov     24(%[from]),%[pre]\n\t"
			 "mov     28(%[from]),%[last]\n\t"
			 "movdqa  %%xmm0,   (%[to])\n\t"
			 "movq    %%mm0,  16(%[to])\n\t"
			 "mov     %[pre], 24(%[to])\n\t"
			 "sfence\n\t"
			 "mov     %[last], 28(%[to])\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 : [pre] "=&r"(pre), [last] "=&r"(last)
			 : [to] "r"(to), [from] "r"(from)
			 : "memory", "xmm0", "mm0");
  } else if (len == 64) {
    __asm__ __volatile__(
			 "movdqa    (%[from]),%%xmm0\n\t"
			 "movdqa  16(%[from]),%%xmm1\n\t"
			 "movdqa  32(%[from]),%%xmm2\n\t"
			 "movq    48(%[from]),%%mm0\n\t"
			 "mov     56(%[from]),%[pre]\n\t"
			 "mov     60(%[from]),%[last]\n\t"
			 "movdqa  %%xmm0,   (%[to])\n\t"
			 "movdqa  %%xmm1, 16(%[to])\n\t"
			 "movdqa  %%xmm2, 32(%[to])\n\t"
			 "movq    %%mm0,  48(%[to])\n\t"
			 "mov     %[pre], 56(%[to])\n\t"
			 "sfence\n\t"
			 "mov     %[last], 60(%[to])\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 : [pre] "=&r"(pre), [last] "=&r"(last)
			 : [to] "r"(to), [from] "r"(from)
			 : "memory", "xmm0", "xmm1", "xmm2", "mm0");
  } else if (len == 128) {
    __asm__ __volatile__(
			 "movdqa    (%[from]),%%xmm0\n\t"
			 "movdqa  16(%[from]),%%xmm1\n\t"
			 "movdqa  32(%[from]),%%xmm2\n\t"
			 "movdqa  48(%[from]),%%xmm3\n\t"
			 "movdqa  64(%[from]),%%xmm4\n\t"
			 "movdqa  80(%[from]),%%xmm5\n\t"
			 "movdqa  96(%[from]),%%xmm6\n\t"
			 "movq   112(%[from]),%%mm0\n\t"
			 "mov    120(%[from]),%[pre]\n\t"
			 "mov    124(%[from]),%[last]\n\t"
			 "movdqa  %%xmm0,    (%[to])\n\t"
			 "movdqa  %%xmm1,  16(%[to])\n\t"
			 "movdqa  %%xmm2,  32(%[to])\n\t"
			 "movdqa  %%xmm3,  48(%[to])\n\t"
			 "movdqa  %%xmm4,  64(%[to])\n\t"
			 "movdqa  %%xmm5,  80(%[to])\n\t"
			 "movdqa  %%xmm6,  96(%[to])\n\t"
			 "movq    %%mm0,  112(%[to])\n\t"
			 "mov     %[pre], 120(%[to])\n\t"
			 "sfence\n\t"
			 "mov     %[last], 124(%[to])\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 : [pre] "=&r"(pre), [last] "=&r"(last)
			 : [to] "r"(to), [from] "r"(from)
			 : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
			   "xmm4", "xmm5", "xmm6", "mm0");
  }
}

/*
 * Copy exactly len bytes from `from` to `to`, issuing an sfence between the
 * first len-8 bytes and the final 8 bytes, and another one after them.
 *
 * Only len == 16, 32, 64 and 128 are implemented; any other length is
 * silently ignored and nothing is copied.  The 32-byte and larger variants
 * use movdqa, so both pointers must then be 16-byte aligned.
 *
 * Every asm statement lists the MMX/XMM registers it overwrites as clobbers
 * so that the compiler never assumes their content survives the statement.
 */
static void
memcpy_sse_fence8(char *to, char *from, int len)
{
  if (len == 16) {
    __asm__ __volatile__(
			 "movq    (%1), %%mm0\n\t"
			 "movq   8(%1), %%mm1\n\t"
			 "movq   %%mm0,  (%0)\n\t"
			 "sfence\n\t"
			 "movq   %%mm1, 8(%0)\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "mm0", "mm1");
  } else if (len == 32) {
    __asm__ __volatile__(
			 "movdqa    (%1),%%xmm0\n\t"
			 "movq    16(%1), %%mm0\n\t"
			 "movq    24(%1), %%mm1\n\t"
			 "movdqa  %%xmm0,   (%0)\n\t"
			 "movq     %%mm0, 16(%0)\n\t"
			 "sfence\n\t"
			 "movq     %%mm1, 24(%0)\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "xmm0", "mm0", "mm1");
  } else if (len == 64) {
    __asm__ __volatile__(
			 "movdqa    (%1),%%xmm0\n\t"
			 "movdqa  16(%1),%%xmm1\n\t"
			 "movdqa  32(%1),%%xmm2\n\t"
			 "movq    48(%1), %%mm0\n\t"
			 "movq    56(%1), %%mm1\n\t"
			 "movdqa  %%xmm0,   (%0)\n\t"
			 "movdqa  %%xmm1, 16(%0)\n\t"
			 "movdqa  %%xmm2, 32(%0)\n\t"
			 "movq     %%mm0, 48(%0)\n\t"
			 "sfence\n\t"
			 "movq     %%mm1, 56(%0)\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "xmm0", "xmm1", "xmm2", "mm0", "mm1");
  } else if (len == 128) {
    __asm__ __volatile__(
			 "movdqa    (%1),%%xmm0\n\t"
			 "movdqa  16(%1),%%xmm1\n\t"
			 "movdqa  32(%1),%%xmm2\n\t"
			 "movdqa  48(%1),%%xmm3\n\t"
			 "movdqa  64(%1),%%xmm4\n\t"
			 "movdqa  80(%1),%%xmm5\n\t"
			 "movdqa  96(%1),%%xmm6\n\t"
			 "movq   112(%1), %%mm0\n\t"
			 "movq   120(%1), %%mm1\n\t"
			 "movdqa  %%xmm0,    (%0)\n\t"
			 "movdqa  %%xmm1,  16(%0)\n\t"
			 "movdqa  %%xmm2,  32(%0)\n\t"
			 "movdqa  %%xmm3,  48(%0)\n\t"
			 "movdqa  %%xmm4,  64(%0)\n\t"
			 "movdqa  %%xmm5,  80(%0)\n\t"
			 "movdqa  %%xmm6,  96(%0)\n\t"
			 "movq     %%mm0, 112(%0)\n\t"
			 "sfence\n\t"
			 "movq     %%mm1, 120(%0)\n\t"
			 "sfence\n\t"
			 "emms\n\t"
			 ::"r"(to), "r"(from)
			 : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
			   "xmm4", "xmm5", "xmm6", "mm0", "mm1");
  }
}

#define FORCE_NO_SSE 0
#else /* SSE not implemented */
#define FORCE_NO_SSE 1
#endif /* SSE not implemented */

/*
 * Post one request by copying it to `ureq`, then busy-wait until the type of
 * `mcp_event` becomes non-zero.  The cycle counter is sampled right before
 * the first write and right after the event has been seen.
 *
 * batch        - host copy of the request to post
 * ureq         - destination the request is copied to
 * reqlen       - number of bytes to copy
 * fence        - > 0: the last `fence` bytes are written separately, after a
 *                store barrier;
 *                < 0: the whole request is written with its last -fence
 *                bytes zeroed, then those bytes are written again with their
 *                real value;
 *                otherwise (0, or |fence| >= reqlen): written in one go
 * use_sse      - non-zero to copy with the memcpy_sse_* routines instead of
 *                memcpy(); when the file is built with FORCE_NO_SSE this
 *                path is compiled out and nothing is posted
 * mcp_event    - event slot polled for completion
 * cycles_start - out: cycle counter sampled before posting
 * cycles_stop  - out: cycle counter sampled once the event has been seen
 *
 * The event type is asserted to be MX_MCP_UEVT_PIO_BENCH.
 */
static inline void
submit_and_wait(mcp_ureq_t * batch, mcp_ureq_t *ureq, int reqlen, int fence, int use_sse,
		mcp_uevt_t * mcp_event,
		mx_cycles_t * cycles_start, mx_cycles_t * cycles_stop)
{
  volatile uint8_t type;
  /* the copied bytes are placed so that they end on a slot boundary: for
   * reqlen <= sizeof(mcp_ureq_t) this is sizeof(mcp_ureq_t) - reqlen */
  int offset_in_slot = (sizeof(mcp_ureq_t) - reqlen) % sizeof(mcp_ureq_t);
  /* Warning: does not work if reqlen > sizeof(mcp_ureq_t) and not a multiple of sizeof(mcp_ureq_t) */

  *cycles_start = mx_get_cycles();

  /* post request */
  if (use_sse) {
#if !FORCE_NO_SSE
    if (fence > 0 && fence < reqlen) {
      /* write the whole request with a fence before the type */
      /* fence is expected to be 4 or 8 here; any other value posts nothing */
      if (fence == 4)
	memcpy_sse_fence4(((char*)ureq) + offset_in_slot,
			  ((char*)batch) + offset_in_slot,
			  reqlen);
      else if (fence == 8)
	memcpy_sse_fence8(((char*)ureq) + offset_in_slot,
			  ((char*)batch) + offset_in_slot,
			  reqlen);

    } else if (fence < 0 && -fence < reqlen) {
      /* write the whole request at once without the ending type, and rewrite the type afterwards */

      /* save -fence bytes from the end of the batch, and replace with 0 */
      char after[8]; /* fence is -4 or -8 */
      memcpy(after, ((char*)batch) + offset_in_slot + reqlen + fence, -fence);
      memset(((char*)batch) + offset_in_slot + reqlen + fence, 0, -fence);

      /* write the whole request at once */
      memcpy_sse_nofence(((char*)ureq) + offset_in_slot,
			 ((char*)batch) + offset_in_slot,
			 reqlen);

      /* write the actual -fence bytes from the end of the batch that we saved previously */
      memcpy_sse_nofence(((char*)ureq) + offset_in_slot + reqlen + fence, after, -fence);

      /* restore the request end */
      memcpy(((char*)batch) + offset_in_slot + reqlen + fence, after, -fence);

    } else {
      /* no need to fence ever */

      /* write the whole request at once */
      memcpy_sse_nofence(((char*)ureq) + offset_in_slot,
			 ((char*)batch) + offset_in_slot,
			 reqlen);
    }
#endif /* FORCE_NO_SSE */

  } else {
    if (fence > 0 && fence < reqlen) {
      /* write the whole request except the type */
      memcpy(((char*)ureq) + offset_in_slot,
	     ((char*)batch) + offset_in_slot,
	     reqlen - fence);
      MX_STBAR();

      /* write the type */
      memcpy(((char*)ureq) + offset_in_slot + reqlen - fence,
	     ((char*)batch) + offset_in_slot + reqlen - fence,
	     fence);
      MX_STBAR();

    } else if (fence < 0 && -fence < reqlen) {
      /* write the whole request at once without the ending type, and rewrite the type afterwards */

      /* save -fence bytes from the end of the batch, and replace with 0 */
      char after[8]; /* fence is -4 or -8 */
      memcpy(after, ((char*)batch) + offset_in_slot + reqlen + fence, -fence);
      memset(((char*)batch) + offset_in_slot + reqlen + fence, 0, -fence);

      /* write the whole request at once */
      memcpy(((char*)ureq) + offset_in_slot,
	     ((char*)batch) + offset_in_slot,
	     reqlen);
      MX_STBAR();

      /* write the actual -fence bytes from the end of the batch that we saved previously */
      memcpy(((char*)ureq) + offset_in_slot + reqlen + fence, after, -fence);
      MX_STBAR();

      /* restore the request end */
      memcpy(((char*)batch) + offset_in_slot + reqlen + fence, after, -fence);

    } else {
      /* no need to fence ever */

      /* write the whole request at once */
      memcpy(((char*)ureq) + offset_in_slot,
	     ((char*)batch) + offset_in_slot,
	     reqlen);
      MX_STBAR();
    }
  }

  /* wait for completion */
  /* spin until the event type is non-zero; there is no timeout */
  do {
    MX_READBAR();
    type = mcp_event->basic.type;
  } while (!type);

  *cycles_stop = mx_get_cycles();

  assert(type == MX_MCP_UEVT_PIO_BENCH);
}


/*
 * Run `warmup` untimed then `iter` timed PIO round trips of a reqlen-byte
 * request, storing the duration of each timed round trip, in cycles, in
 * cycles[0..iter-1].
 *
 * ep       - endpoint whose request ring and event queue are driven directly
 * reqlen   - number of bytes written per request
 * evlen    - the request carries evlen-1 in its `length` field (presumably
 *            the length of the event to generate - TODO confirm)
 * fence    - fencing mode, passed through to submit_and_wait()
 * use_sse  - non-zero to post with the SSE copy routines
 * cycles   - out: array of at least `iter` entries
 * iter     - number of timed iterations
 * warmup   - number of leading iterations whose timing is discarded
 * verbose  - non-zero to print every timed sample
 *
 * If the endpoint is flagged is_ze and reqlen is not 64, a message is
 * printed and the function returns without touching `cycles`.
 * The process exits if the staging buffer cannot be allocated.
 */
static void
run_test_loop(struct mx_endpoint * ep,
	      int reqlen, int evlen, int fence, int use_sse,
	      mx_cycles_t *cycles, int iter, int warmup, int verbose)
{
  int i;

  /* the actual request that will be pio'ed, aligned in batch_buffer */
  char * batch_buffer;
  mcp_ureq_t *batch;
  mcp_ureq_piobench_t *piobench;
  /* number of request slots needed to hold reqlen bytes, rounded up */
  int slots_per_req = (reqlen+sizeof(mcp_ureq_t)-1)/sizeof(mcp_ureq_t);

  if ((ep->is_ze) && (reqlen != 64)) {
    printf("Request size must be 64 Bytes for 10G NICs (for now)\n");
    return;
  }

  /* get a batch request */
  batch_buffer = calloc(1, reqlen + sizeof(mcp_ureq_t) + REQ_ALIGN); /* too lazy to use a min() */
  if (batch_buffer == NULL) {
    /* nothing can be measured without the staging buffer */
    fprintf(stderr, "Failed to allocate the request staging buffer\n");
    exit(1);
  }
  /* round the start of the staged request up to a REQ_ALIGN boundary */
  batch = (void*)(((uintptr_t)batch_buffer+REQ_ALIGN-1) & (~(REQ_ALIGN-1)));

  /* prefill the last slot */
  piobench = (mcp_ureq_piobench_t*)(batch + slots_per_req - 1);
  piobench->type = MX_MCP_UREQ_PIOBENCH;
  piobench->length = evlen-1;

  for(i=0; i<iter + warmup; i++) {
    uint16_t mcp_handle;
    mcp_ureq_t *ureq;
    mcp_uevt_t * mcp_event;
    mx_cycles_t cycles_start, cycles_stop;

    /* get a mcp request */
    mcp_handle = mx__endpoint_alloc_mcp_handle(ep);
    ureq = MX__UREQ(ep, mcp_handle);

    if (ep->is_ze)
      ureq = (mcp_ureq_t *)((char *) ureq + (MX_MCP_UREQ_PIOBENCH << 8));

    /* get the next slot */
    mcp_event = (mcp_uevt_t *) ep->eventq_uevt;

    if (slots_per_req > 1) {
      /* if we test PIO larger than a request, we need to avoid the first slots
       * so that we may safely overwrite the previous slots. */
      assert(ep->is_ze == 0);
      if (ep->req_ring->base > ureq + 1 - slots_per_req) {
	/* consume a small request to use this slot and skip to the next request slot */
	submit_and_wait((mcp_ureq_t*)piobench, ureq, 4, fence,
			0, mcp_event, &cycles_start, &cycles_stop);
	/* this round trip does not count: redo the current iteration */
	i--;
	goto skip_result;
      } else {
	/* start to write on the previous slots */
	ureq -= slots_per_req - 1;
      }
    }

    submit_and_wait(batch, ureq, reqlen, fence, use_sse, mcp_event, &cycles_start, &cycles_stop);
    if (i < warmup)
      goto skip_result;

    cycles[i-warmup] = cycles_stop - cycles_start;
    if (verbose)
      /* the arguments are cast to unsigned long long, hence %llu */
      printf("%llu (from %llu to %llu)\n", (unsigned long long) cycles[i-warmup],
	     (unsigned long long) cycles_start, (unsigned long long) cycles_stop);

   skip_result:
    /* prepare next event */
    mx__endpoint_free_mcp_handle(ep, mcp_handle);
    mcp_event->basic.type = 0;
    ep->eventq_uevt++;
    ep->eventq_index++;
    ep->eventq_flow++;
    ep->event_count++;
    /* wrap around at the end of the event queue */
    if (ep->eventq_index == (ep->eventq_length / sizeof (mcp_uevt_t))) {
      ep->eventq_uevt = (mcp_uevt_t *) ep->eventq;
      ep->eventq_index = 0;
    }
  }

  free(batch_buffer);
}

int cmp_mx_cycles(const void *a, const void *b)
{
  return *(mx_cycles_t*)b - *(mx_cycles_t*)a;
}

/* Convert a duration expressed in CPU cycles into whole nanoseconds (truncated). */
static int
bench_cycles_to_ns(double cycles)
{
  return (int)(cycles*mx_seconds_per_cycle()*1000*1000*1000);
}

/*
 * Validate the benchmark parameters, then run the measurement loop for every
 * requested (event length, request length) pair and print one line of
 * statistics (mean, min, max, median, in ns) per pair.
 *
 * ep       - endpoint to benchmark
 * reqlen   - request length: a multiple of 4 in [4,128] (a power of 2 when
 *            use_sse is set, since the SSE copy routines only implement
 *            those sizes), or -1 to sweep 4, 8, ..., 128
 * evlen    - event length in [4,64], or -1 to sweep 4, 8, ..., 64
 * fence    - 0, +/-4 or +/-8
 * use_sse  - non-zero to post with the SSE copy routines
 * iter     - number of timed iterations per pair; nothing is run if <= 0
 * warmup   - number of untimed iterations per pair, must be >= 0
 * verbose  - non-zero to print every sample
 *
 * Returns 0 on success (including iter <= 0), -1 if a parameter is invalid
 * or the sample array cannot be allocated.
 */
static int
run_test(struct mx_endpoint * ep, int reqlen, int evlen, int fence, int use_sse,
	 int iter, int warmup, int verbose)
{
  int reqlen_start = 4, reqlen_stop = 128;
  int evlen_start = 4, evlen_stop = 64;
  mx_cycles_t * cycles;

  if (reqlen != -1) {
    if (reqlen > 128 || reqlen < 4 || (reqlen%4) ) {
      fprintf(stderr, "reqlen must be a multiple of 4 between 4 and 128, or -1 for all\n");
      return -1;
    }
    /* the SSE copy routines silently ignore other sizes, which would leave
     * the benchmark waiting forever for an event that is never generated */
    if (use_sse && (reqlen & (reqlen - 1))) {
      fprintf(stderr, "reqlen must be a power of 2 between 4 and 128 when using SSE\n");
      return -1;
    }
    reqlen_start = reqlen_stop = reqlen;
  }

  if (evlen != -1) {
    if (evlen > 64 || evlen < 4) {
      fprintf(stderr, "evlen must be between 4 and 64, or -1 for all\n");
      return -1;
    }
    evlen_start = evlen_stop = evlen;
  }

  if (fence && fence != 4 && fence != 8 && fence != -4 && fence != -8) {
    fprintf(stderr, "fence must be 0 (no fence) or +/-4 or +/-8\n");
    return -1;
  }

  /* a negative warmup would leave the first samples uninitialized */
  if (warmup < 0) {
    fprintf(stderr, "warmup must not be negative\n");
    return -1;
  }

  if (iter <= 0)
    return 0;

  cycles = malloc(sizeof(mx_cycles_t) * iter);
  if (cycles == NULL) {
    fprintf(stderr, "Failed to allocate %d samples\n", iter);
    return -1;
  }

  for(evlen = evlen_start; evlen <= evlen_stop; evlen <<= 1)
    for(reqlen = reqlen_start; reqlen <= reqlen_stop; reqlen <<= 1) {
      int i;
      mx_cycles_t total_cycles, min_cycles, max_cycles;
      int mean_ns, min_ns, max_ns, median_ns;

      run_test_loop(ep, reqlen, evlen, fence, use_sse, cycles, iter, warmup, verbose);

      /* sort to get the median */
      qsort(cycles, iter, sizeof(mx_cycles_t), cmp_mx_cycles);

      /* compute the total/max/min */
      total_cycles = min_cycles = max_cycles = cycles[0];
      for(i=1; i<iter; i++) {
	min_cycles = cycles[i] < min_cycles ? cycles[i] : min_cycles;
	max_cycles = cycles[i] > max_cycles ? cycles[i] : max_cycles;
	total_cycles += cycles[i];
      }
      /* get corresponding timings */
      mean_ns = bench_cycles_to_ns(((double)total_cycles)/iter);
      min_ns = bench_cycles_to_ns((double)min_cycles);
      max_ns = bench_cycles_to_ns((double)max_cycles);
      /* the middle of the sorted samples is at (iter-1)/2 and iter/2: the
       * same entry when iter is odd, the two central entries when even */
      median_ns = bench_cycles_to_ns(((double)cycles[(iter-1)/2] + (double)cycles[iter/2])/2);

      printf("req % 4d ev % 3d fence %d%s: mean % 5d ns, min % 5d ns, max % 7d ns med % 5d ns (%d iter)\n",
	     reqlen, evlen, fence, use_sse ? " (SSE)" : "",
	     mean_ns, min_ns, max_ns, median_ns, iter);
    }

  free(cycles);

  return 0;
}

/*
 * Parse the command line, open the requested endpoint, run the benchmark
 * with the endpoint lock held, and release everything.
 *
 * Returns 0 on success and 1 if the library cannot be initialized, the
 * endpoint cannot be opened, a negative fence is requested on a ZE board,
 * or run_test() rejects the parameters.  An unknown option (or -h) prints
 * the usage and exits with status 1.
 */
int
main(int argc, char **argv)
{
  mx_endpoint_t ep;
  mx_return_t ret;
  int c;
  int status = 0;
  uint32_t nic_id = DEFAULT_NIC_ID;
  uint32_t eid = DEFAULT_EID;
  int reqlen = DEFAULT_REQLEN;
  int evlen = DEFAULT_EVLEN;
  int fence = DEFAULT_FENCE;
  int iter = DEFAULT_ITER;
  int warmup = DEFAULT_WARMUP;
  int use_sse = DEFAULT_USE_SSE;
  int verbose = 0;

  /* getopt() reports the end of the options with -1 */
  while ((c = getopt(argc, argv, "b:e:R:E:F:N:W:Svh")) != -1) switch(c) {
  case 'b':
    nic_id = atoi(optarg);
    break;
  case 'e':
    eid = atoi(optarg);
    break;
  case 'R':
    reqlen = atoi(optarg);
    break;
  case 'E':
    evlen = atoi(optarg);
    break;
  case 'F':
    fence = atoi(optarg);
    break;
  case 'N':
    iter = atoi(optarg);
    break;
  case 'W':
    warmup = atoi(optarg);
    break;
  case 'S':
#if !FORCE_NO_SSE
    use_sse = 1;
#else /* FORCE_NO_SSE */
    /* the request is ignored and the benchmark runs without SSE */
    fprintf(stderr, "SSE not supported\n");
#endif /* FORCE_NO_SSE */
    break;
  case 'v':
    verbose = 1;
    break;
  default:
    usage();
    exit(1);
  }

  /* report failures explicitly: an assert() would vanish with NDEBUG */
  ret = mx_init();
  if (ret != MX_SUCCESS) {
    fprintf(stderr, "mx_init failed (error %d)\n", (int)ret);
    return 1;
  }
  ret = mx_open_endpoint(nic_id, eid, 0, NULL, 0, &ep);
  if (ret != MX_SUCCESS) {
    fprintf(stderr, "mx_open_endpoint failed (error %d)\n", (int)ret);
    mx_finalize();
    return 1;
  }

  if (fence < 0 && ep->is_ze) {
    fprintf(stderr, "Negative fence disabled on ZE boards\n");
    status = 1;
  } else {
    MX__MUTEX_LOCK(&ep->lock);

    /* propagate parameter/allocation errors to the exit status */
    if (run_test(ep, reqlen, evlen, fence, use_sse, iter, warmup, verbose) != 0)
      status = 1;

    MX__MUTEX_UNLOCK(&ep->lock);
  }

  mx_close_endpoint(ep);
  mx_finalize();

  return status;
}
